/* c_SIMS_panel_create.do - ****************************************************

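	This file matches the NSC data to sims data. We do this using primary
	name and DOB from enrollment files.

	NOTE: as written, this file cleans the yearly SIMS October and end-of-year
	files, links them to the SASID crosswalk on sasid, collapses to one row per
	student and grade, and reshapes to one row per student. Names and DOB are
	cleaned here, but no name/DOB matching is performed in this file.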

******************************************************************************/

* Do not pause output for --more-- prompts
set more off

* Stage switches: set to 1 to run the stage, anything else to skip it
*   full_merge   - rebuild the cleaned/appended SIMS files from the raw files
*   create_VAM   - run the VA outcomes do-file
*   create_peers - run the peer outcomes do-file
local full_merge   1
local create_VAM   1
local create_peers 1

if `full_merge' == 1 {
* Clean every raw SIMS file (one per period `t' and year `y'), save the cleaned
* file, then save a second copy restricted to students in the SASID crosswalk.
foreach t in oct eoy {
	local fyr 2001
	local lyr 2017

	*Eoy indicates the end of the school year so we need to push the year
	*forward one
	if "`t'"=="eoy" {
		local fyr 2002
		local lyr 2018
	}

	forvalues y = `fyr'/`lyr' {
		*Raw files are named with the last two digits of the year
		local yyo = substr("`y'",-2,2)
		use "$raw_data_sims/sims`t'`yyo'.dta", clear

		*Harmonize names when the md_ prefixed versions are present
		cap rename md_lasid lasid
		cap rename md_gender gender

		*the october 2010 and 2011 files use a different date convention.
		if inlist(`y',2010,2011) & "`t'"=="oct" {
			gen DOB = date(dob,"DMY",2050)
		}
		else {
			gen DOB = date(dob,"MDY")
		}
		format DOB %td

		*Keep the raw string as olddob; dob now holds the Stata date
		rename dob olddob
		rename DOB dob

		foreach x in fname mname lname {
			replace `x' = lower(`x')
		}


		*clean variable names (each rename is skipped when the source name
		*is absent in this file)
		cap rename lowinc low_inc
		cap rename ORG_CODE org_code
		cap rename truant truancy
		cap rename IN_SUSP in_susp
		cap rename OUT_SUSP out_susp
		cap rename CITY_OB city_ob
		cap rename LVL_NEED lvl_need
		cap rename ELP_PROG elp_prog
		cap rename elp_eng_lrners_pgm_code elp_prog

		*Create placeholders for variables a given file does not carry, so
		*every cleaned file has the same set of columns. capture makes each
		*line a no-op when the variable already exists.
		cap gen low_inc = ""
		cap gen rfr = ""		// reason for reporting
		cap gen enstat = ""		// enrollment status
		cap gen rfe = "" 		// reason for enrollment
		cap gen member = .a		// number of days a member of school
		cap gen attend = .a		// number of days attended
		cap gen plan = ""		// High school completer plan
		cap gen in_susp = .
		cap gen out_susp = .
		cap gen truancy = .
		cap gen spedeval = ""

		*Force consistent storage types across years so the files append
		*cleanly. out_susp is coerced like in_susp: both are summed later.
		destring member, replace force
		destring attend, replace force
		destring in_susp, replace force
		destring out_susp, replace force
		destring truancy, replace force
		tostring low_inc, replace force
		tostring enstat, replace force
		tostring rfe, replace force
		tostring plan, replace force
		tostring spedeval, replace force
		destring enstat, replace
		cap rename town_re town

		*Fix up Special education results for some years
		cap rename spd_sped_prmy_disb_code 	spd_prim_dis
		cap rename sns_sped_natr_svcs_code 	spd_natr_svcs
		cap rename sln_sped_lvl_need_code 	spd_lvl_need
		cap rename lvl_need					spd_lvl_need
		cap rename sped 					spd_placement
		cap rename sped_6to21				spd_placement
		cap rename sped_6to					spd_placement
		cap gen spd_lvl_need = ""

		*English language program variables. "miss" marks files without
		*elp_prog, so ell_pgm stays missing for them.
		cap gen elp_prog = "miss"
		gen ell_pgm = elp_prog != "00" if elp_prog != "miss"
		gen full_school = school

		*SIMS school codes in 2001-2002 are formatted differently (6 digits instead of 8)
		*Rebuild an 8-character code as org_code*10000 + last three digits.
		if (`y' <= 2002) | (`y'== 2003 & "`t'"=="eoy")  {
			gen temp_name = full_school
			qui replace temp_name = substr(temp_name, -3, .)
			destring temp_name, replace
			gen org_code_temp = org_code
			destring org_code_temp, replace
			replace temp_name = org_code_temp*10000+temp_name
			tostring temp_name, replace format(%08.0f)
			replace full_school = temp_name
			drop temp_name org_code_temp
		}

		*The last four digits are the school code. The other digits are location
		*code, but this is irrelevant due to our data only being BPS
		gen district = substr(full_school, 1, 4)
		replace school = substr(full_school, -4, .)
		keep sasid fname lname mname dob gender race org_code low_inc spd_lvl_need spd_placement ///
			rfr enstat rfe grade attend member plan in_susp out_susp truancy district school town city_ob ell_pgm lep full_school

		*these are observed as missing values as one cannot go to school for 555 days
		*a year and there are quite a few students with this coding who also have no
		*other values
		drop if enstat == 7
		drop if attend == 555 | member == 555

		*Specify that the name is coming from SIMS data set
		foreach x in fname mname lname {
			rename `x' s`x'
		}

		* Save SIMS files before filtering to preK applicants
		save "$stata_data_sims/`y'/all_sim`t'.dta", replace

		*Merge with the cleaned SASID sample, keeping matched students only
		merge m:1 sasid using "$stata_data_crosswalks/SASID_Studentno.dta", keep(match)
		compress
		save "$stata_data_sims/`y'/sasid_sim`t'.dta", replace

	}
}
* Stack the crosswalk-matched yearly files into one long file. Each appended
* file is tagged with its year (simyear) and period (t) right after it is
* added: the new rows are the only ones where those two variables are missing.
clear
foreach wave in oct eoy {
	* eoy files are indexed one year later than oct files
	local shift = cond("`wave'" == "eoy", 1, 0)
	local first_year = 2001 + `shift'
	local last_year  = 2017 + `shift'

	* Append cleaned SASID sample filtered to preK applicants
	forvalues yr = `first_year'/`last_year' {
		append using "$stata_data_sims/`yr'/sasid_sim`wave'.dta"

		* First pass creates the tag variables; later passes only fill gaps
		capture generate simyear = `yr'
		replace simyear = `yr' if simyear >= .
		capture generate t = "`wave'"
		replace t = "`wave'" if t == ""
	}
}

compress
save "$stata_data_sims/sasid_full_build.dta", replace

* Stack the full (unfiltered) cleaned yearly files into one long file, tagging
* each appended file with its year (simyear) and period (t).
clear
foreach wave in oct eoy {
	* eoy files are indexed one year later than oct files
	local shift = cond("`wave'" == "eoy", 1, 0)
	local first_year = 2001 + `shift'
	local last_year  = 2017 + `shift'

	* Append cleaned SIMS files
	forvalues yr = `first_year'/`last_year' {
		append using "$stata_data_sims/`yr'/all_sim`wave'.dta"

		* Newly appended rows are the ones with missing tags
		capture generate simyear = `yr'
		replace simyear = `yr' if simyear >= .
		capture generate t = "`wave'"
		replace t = "`wave'" if t == ""
	}
}

compress
* Save full SIMS extract for creation of VA outcomes
save "$stata_data_sims/all_sasid_full_build.dta", replace

}

******************************Create VAM code***********************************
* Runs the VA outcomes do-file when the create_VAM switch equals 1
if (`create_VAM' == 1) {
	do "$code/2_Analysis_Prep/create_VA_outcomes.do"
}

*************************Create Peer Characteristics****************************
* Runs the peer outcomes do-file when the create_peers switch equals 1
if (`create_peers' == 1) {
	do "$code/2_Analysis_Prep/create_peer_outcomes.do"
}

* Load the appended crosswalk-matched SIMS panel and build the timing, grade,
* age and on-time progression variables.
set more off
use "$stata_data_sims/sasid_full_build.dta", clear

*Create a variable that indicates the year a school year starts. If it's october
*the school year starts at the same year of observation. If it's eoy, we need
*to go back a year to determine year start.
gen simyear0 = simyear if t=="oct"
replace simyear0 = simyear-1 if t=="eoy"

*Create grade variables for prek and kindergarten
*(PK -> -1, KF/KP/KT -> 0; any other non-numeric grade becomes .a)
replace grade = "-1" if grade=="PK"
replace grade = "0" if inlist(grade,"KF","KP","KT")
destring grade, replace force
replace grade = .a if missing(grade)

*Indicate if the observation is from beginning of year or end of year
gen period = 0 if t=="oct"
replace period= 1 if t=="eoy"

*Rename variables to indicate their origins in the SIMs data
foreach x in race gender grade {
	rename `x' sim`x'
}

*Clean Age data
*Age in whole years on June 1 of the calendar year in which the school year
*ends. simyear0+1 is used rather than simyear+1: for eoy rows simyear already
*is the ending year, so simyear+1 measured age one year later for eoy rows
*than for oct rows of the same school year.
gen age = floor((mdy(6,1,simyear0+1) - dob)/365)
gen yob = year(dob)
gen mob = month(dob)

*Ontime calculations. This tracks proper grade progression. This follows
*assumptions regarding when someone would start a year based on their month
*of birth
*(born Jan-Aug: expected start year of the grade is yob+5+grade;
* born Sep-Dec: yob+6+grade)
gen ontime = yob+5 + simgrade <= simyear0 if inrange(mob,1,8)
replace ontime = yob+6 + simgrade <= simyear0 if inrange(mob,9,12)

*Now the assignment version. This tracks ontime enrollment according to
*the year they applied to PreK
*NOTE(review): yr_asgn is not created in this file; presumably it comes from
*the SASID crosswalk merged in earlier - confirm.
gen ontime_asgn = yr_asgn + simgrade + 1 <= simyear0

* Enrollment status indicators built from the enstat code
* (0/1 flags; a missing enstat yields 0 for every flag, has_enstat marks it)
generate hsgrad      = (enstat == 4)
generate transferred = (enstat == 2)
generate enrolled    = (enstat == 1)
generate has_enstat  = (enstat != .)

* School type / exit indicators
generate charter = (rfe == "03")
generate metco   = (rfe == "04")
generate dropout = inlist(enstat, 30, 31, 32, 33, 34, 35, 36)
generate excl    = (enstat == 5)

* Rows at the two school codes flagged as incarceration, and the membership
* days spent there (member days when flagged, 0 otherwise)
generate incarcerated = inlist(full_school, "09200300", "09200500")
generate days_incarc  = member * incarcerated

* Rows at the three school codes flagged as exam schools
generate exam = inlist(full_school, "00350560", "00350545", "00350575")

* The string comparisons above are done; store the school code as a number
destring full_school, replace

*Special Education Results
*One 0/1 indicator per level-of-need code; missing when the code is blank
gen spd_lowest_need = spd_lvl_need == "01" if spd_lvl_need != ""
gen spd_low_need = spd_lvl_need == "02" if spd_lvl_need != ""
gen spd_mod_need = spd_lvl_need == "03" if spd_lvl_need != ""
gen spd_high_need = spd_lvl_need == "04" if spd_lvl_need != ""
gen spd_no_need = spd_lvl_need == "500" if spd_lvl_need != ""
gen any_spd = spd_no_need != 1 if spd_lvl_need != ""

* Since 2001 and 2002 SIMS files do not have SPED Level of Need variable, use SPED placement
* variable instead
* Both overrides require a non-blank placement code. Without that guard a blank
* placement would set spd_no_need to 0 while any_spd stayed missing.
replace spd_no_need = spd_placement == "00" if spd_placement != "" & inlist(simyear0, 2001, 2002)
replace any_spd = spd_no_need != 1 if spd_placement != "" & inlist(simyear0, 2001, 2002)

*find the last observed simgrade and the last associated enstat
*Step 1: highest simgrade among rows that have an enstat, spread to all rows
*of the student.
*Step 2: sort so rows at that last grade come first and take their enstat.
*NOTE(review): last_enstat is dropped on the final line of this block without
*being used, and last_simgrade is not referenced again before the collapse
*below (which keeps only the variables it lists). As written the block only
*changes the sort order - confirm whether it is still needed.
bysort studentno has_enstat: egen last_simgrade_sub = max(simgrade) if has_enstat == 1
bysort studentno: egen last_simgrade = max(last_simgrade_sub)
drop last_simgrade_sub
gen isnt_last_simgrade = simgrade != last_simgrade
bysort studentno (isnt_last_simgrade): gen last_enstat = enstat[1]
drop isnt_last_simgrade last_enstat

*Track number of absences and suspensions
gen num_absences = member - attend
replace out_susp = 0 if missing(out_susp)
replace in_susp = 0 if missing(in_susp)
gen anysusp = out_susp + in_susp

*Low income status
*base_inc is the low_inc value from the student's first row after ordering by:
*rows with a usable low_inc first (no_low == 0), then lowest grade, earliest
*school year, oct before eoy, lowest enstat, and highest attendance.
destring low_inc, replace
gen no_low = low_inc == 555 | low_inc == .
*sort/bysort only order ascending, so a leading minus on a variable name is
*not a descending key. Use an explicit negated copy of attend instead.
gen double neg_attend = -attend
bysort studentno (no_low simgrade simyear0 period enstat neg_attend): gen base_inc = low_inc[1]
drop low_inc no_low neg_attend
gen frpl_sim = base_inc == 1| base_inc == 2 if base_inc != .

*Within student-grade, put the row with the most membership days first so the
*(firstnm) statistics below take their values from it. As above, descending
*order needs a negated key; stable keeps the existing order among ties.
gen double neg_member = -member
sort studentno simgrade neg_member, stable
*Collapse on studentno simgrade
*(neg_member is not listed below, so collapse discards it)
collapse (firstnm) rfe town sasid city_ob rfr full_school school ///
	(min) simyear0 age (max) final_simyear = simyear0 ///
	(max) has_enstat ell_pgm any_spd enrolled ontime_asgn incarcerated enstat transferred ///
	spd_lowest_need spd_low_need spd_mod_need spd_high_need spd_no_need ///
	charter base_inc exam metco dropout excl ///
	ontime hsgrad frpl_sim (sum) per_incarc = incarcerated days_incarc num_absences attend member truancy in_susp ///
	out_susp anysusp, by(studentno simgrade)

* Attendance shares
*   attsh   - student-level mean of attend/member across the student's grades
*   attshby - attend/member for this student-grade row
bysort studentno: egen attsh = mean(attend / member)
generate attshby = attend / member

* One row per student and grade
save "$stata_data_sims/SIMS_analysis_uniq.dta", replace

*Determine if a student ever repeated a grade. This occurs when the student is observed
*in school for multiple years.
*After the collapse, simyear0 is the first and final_simyear the last school
*year observed for this student-grade, so a positive gap flags a repeat.
use "$stata_data_sims/SIMS_analysis_uniq.dta", clear
gen repeat_grade_asgn = final_simyear - simyear0 > 0

*This is a hack to fill in missing observations to allow for reshape. If a student
*is missing a year, this block creates observations for the missing years and fills
*in all missing observations
*Steps: build one extra row per student at simgrade 13, append it, declare
*(studentno, simgrade) as the panel so tsfill inserts a row for every grade
*gap up to 13, then drop the grade-13 sentinel and anything outside -1..12.
*Note that rows with a missing simgrade are also removed by the final drop,
*because missing compares as greater than 12.
preserve
keep studentno simgrade
collapse simgrade, by(studentno)
replace simgrade = 13
tempfile add_more
save `add_more'
restore
append using `add_more'
sort studentno simgrade
tsset studentno simgrade
tsfill
drop if simgrade > 12 | simgrade < -1

*Deal with missing sasid
*Rows created by tsfill have no sasid; copy the student's observed value onto
*them. Missing sorts first for strings ("") but last for numerics, so the
*row holding a non-missing value depends on the storage type.
capture confirm string variable sasid
if _rc == 0 {
	bysort studentno (sasid): replace sasid = sasid[_N]
}
else {
	bysort studentno (sasid): replace sasid = sasid[1]
}

* Merge in school-by-grade characteristics. The using files are keyed on a
* variable named grade, so a temporary copy of simgrade is made for the merge.
* Unmatched using rows are discarded; every master row is kept.
generate grade = simgrade
foreach src in "$stata_data_sims/school_level.dta" "$stata_data_mcas/school_mcas.dta" {
	merge m:1 school grade using "`src'", keep(1 3) nogenerate
}
drop grade

* Merge in VA outcomes by school grade
merge m:1 full_school simgrade using "$stata_data_sims/VAM_schl_gr_ests.dta", keep(1 3) nogenerate

* Turn simgrade into the string suffix used by the reshape (-1 becomes PK)
tostring simgrade, replace
replace simgrade = "PK" if simgrade == "-1"

* Rows added by tsfill have no frpl_sim; give every row of a student the
* student's maximum so the value is constant within studentno
tempvar frpl_any
bysort studentno: egen `frpl_any' = max(frpl_sim)
replace frpl_sim = `frpl_any'
drop `frpl_any'

*Reshape for regression analysis
*Goes from one row per student-grade to one row per student. Each stub listed
*below becomes one column per grade, suffixed with the string simgrade
*(PK, 0, 1, ..., 12). Variables not listed must be constant within studentno
*for reshape to succeed.
reshape wide days_incarc age attsh has_enstat base_inc attshby rfe town city_ob rfr school full_school ///
	spd_lowest_need spd_low_need spd_mod_need spd_high_need spd_no_need ///
	enstat transferred truancy in_susp out_susp ontime_asgn ///
	repeat_grade_asgn num_absences enrolled any_spd per_incarc ///
	anysusp exam incarcerated charter VAMtest VAMtest_to_BPS wgt_bps_VAMtest_avg  ///
  metco dropout excl simyear0 ///
	ontime hsgrad attend member final_simyear ell_pgm sch_total_susp sch_truancy ///
	sch_std_e sch_std_m sch_std_scaled_m sch_std_scaled_e , i(studentno) j(simgrade) string

*Create VAM averages for each student
*After the reshape the per-grade columns are VAMtest<grade> and
*VAMtest_to_BPS<grade>. The wildcard VAMtest* matches both sets, so the
*VAMtest average is taken over an explicit list that excludes the
*VAMtest_to_BPS columns.
unab vam_all : VAMtest*
unab vam_bps : VAMtest_to_BPS*
local vam_own : list vam_all - vam_bps
egen VAMtest_avg = rowmean(`vam_own')
egen VAMtest_to_BPS_avg = rowmean(`vam_bps')

* Attach the student-level file of school peer characteristics (SIMS-based).
* Every master row is kept; students found only in the using file are dropped.
merge m:1 studentno using "$stata_data_sims/school_level_peer_chars.dta", keep(1 3) nogenerate

* Final wide file: one row per student
save "$stata_data_sims/SIMS_wide.dta", replace
